********************************************************************************

********************************************************************************

********************************************************************************
clear all
********************************************************************************


//Load data
////////////////////////////////////////////////////////////////////////////////

// Memory has been emptied by -clear all- above; now read the dataset that
// every check in this do-file operates on.
use "data_adm.dta", clear


//Clean data 
////////////////////////////////////////////////////////////////////////////////


********************************************************************************
****Employment: l_under_total_n l_surface_total_n l_total_n

**Check sums: l_total_n is compared with l_under_total_n + l_surface_total_n.
**rowtotal() is called with the -missing- option so that rows where BOTH
**components are missing get a missing testsum instead of 0. Without the
**option such rows would show up as mismatches, and the replace further down
**would overwrite a reported total with 0.
egen testsum = rowtotal(l_under_total_n l_surface_total_n), missing
gen testdiff = l_total_n - testsum
tab testdiff
**Audit notes on the flagged rows (references as written by the original author):
*1849(101)-3/38: mistake in source, prefer testsum
*1849(101)-3/40: mistake in source, prefer testsum
*1849(101)-5/15: mistake in source, prefer testsum
*1881(111)-7/11: missing in source, imputed in Excel
*1881(111)-7/multiple: unreadable in source, imputed in Excel
*****
*1846(180)-5/13: mistake in source, prefer testsum
*1846(180)-5/33: missing in source, imputed in Excel
*1847(181)-3/28: mistake in source, prefer testsum
*1847(181)-3/34: mistake in source, prefer testsum
*1847(181)-5/21: mistake in source, prefer testsum
*1847(181)-6/15: mistake in source, prefer testsum	!!!!!
*1848(182)-7/9: mistake in source, altered in Excel
*1862(191)-7/26: mistake in source, prefer testsum
*1862(191)-7/27: mistake in source, prefer testsum
*1865(194) -5/30: mistake in source, prefer testsum

**Optional inspection of the mismatching rows (kept disabled):
/*
keep if testdiff!=0 & !missing(testdiff)

order testdiff l_under_total_n l_surface_total_n l_total_n testsum
sort series year district arrondissement ordernr
*/

**Prefer the component sum wherever it disagrees with a reported total.
**Rows without a reported total, or without any reported component, are left
**untouched.
replace l_total_n = testsum if testsum!=l_total_n & !missing(l_total_n) & !missing(testsum) //=> 11 changes (count noted by the original author)
drop testdiff testsum


********************************************************************************
****Wages: l_under_total_w l_surface_total_w

**Checked implausible outliers in l_under_total_w l_surface_total_w l_total_w
**Checked implausible outliers in l_under_total_w_gross l_under_total_w_net l_surface_total_w_gross l_surface_total_w_net l_total_w_gross l_total_w_net l_miner_total_w_gross l_miner_total_w_net
**Checked implausible outliers in l_under_men_w l_under_women_w l_under_boys_w l_under_girls_w l_surface_men_w l_surface_women_w l_surface_boys_w l_surface_girls_w


**Check sums: for each wage variant (w, w_gross, w_net) the reported overall
**wage l_total_<variant> is compared with the employment-weighted mean of the
**underground and surface wages. The difference is rounded to 0.01.
**Nothing is changed in the data; the scratch variables are dropped again
**at the end of every pass.
**
**To inspect the flagged rows, run this after the -order- line of a pass:
***keep if testwdiff>0.02 & !missing(testwdiff)

**Findings for l_total_w:
*1874(108): miscalculation in source
*1876(109): miscalculation in source
*1881(111)-6,7: miscalculation in source
*1881(111)-8/5: unclear source, probable fix in Excel
*1882(112)-7/4: miscalculation in source
*1883(113)-6,8: miscalculation in source
*1885(114A): miscalculation in source
*1891(115)-5: wrong correction in source, fixed in Excel
*****
*1873(200)-4/26: miscalculation in source
*1878(202)-4,7: miscalculation in source
*1879(203)-4/4: miscalculation in source

**Findings for l_total_w_gross:
*1903(119)-6/Velaine: miscalculation in source
*1910(123)-6/Muache: miscalculation in source

**Findings for l_total_w_net:
*1891(115)-4: miscalculation in source
*1897(117)-8: miscalculation in source
*1903(119)-6/Velaine: miscalculation in source
*1910(123)-6/Muache: miscalculation in source

foreach wage in w w_gross w_net {
	// weighted mean of the two wage series, weights = head counts
	generate testwmean = (l_under_total_`wage'*l_under_total_n + l_surface_total_`wage'*l_surface_total_n)/(l_under_total_n + l_surface_total_n)
	generate testwdiff = l_total_`wage' - testwmean
	replace testwdiff = round(testwdiff,0.01)
	// park the scratch variables next to l_total_w for browsing
	order test*, after(l_total_w)
	drop testwdiff testwmean
}



********************************************************************************
*****Finances

**Check sums, part 1: within each expense family, <family>_l + <family>_other
**(row total, so a missing component counts as 0) is compared with
**<family>_total. The families are expenses, expenses_std and expenses_spc.
**The checks only tabulate the differences; no data is modified.
**
**To inspect the flagged rows, run this after the -tabulate- line of a pass:
***keep if abs(testdiff)>1 & !missing(testdiff)

**Findings: expenses_l expenses_other expenses_total
*1849(101)-6: miscalculation in source
*1854(104)-5: miscalculation in source
*1859(105)-7/20: mistake in source, imputed in Excel
*1877(110)-7/1: multiple mistakes in source
*1895(116)-7/7: unclear source, probable fix in Excel
*1898(117)-8/5: multiple mistakes in source
*****
*1845(179)-7/19: mistake in source
*1846(180)-5/43: mistake in source
*1846(180)-6/13: mistake in source
*1847(181)-5/38: mistake in source
*1848(182)-3/10: mistake in source
*1858(188)-3/5+26: mistake in source, imputed in Excel
*1864(193)-3/38: mistake in source
*1867(195)-5/28: mistake in source
*1869(196)-7/27: mistake in source
*1871(198)-5/17: mistake in source

**Findings: expenses_std_l expenses_std_other expenses_std_total
*1874(108)-5/21: miscalculation in source
*1877(110)-5/15: miscalculation in source
*1877(110)-7/1: miscalculation in source
*1881(111)-6/32: miscalculation in source
*1882(112)-7/17: unclear source, fix in Excel
*1894(116)-8/1: miscalculation in source
*1899(118)-8/16: miscalculation in source

**Findings: expenses_spc_l expenses_spc_other expenses_spc_total
*1876(109)-5/19: miscalculation in source
*1874(110)-6/20: missing in source, imputed in Excel
*1881(111)-8/18: unclear source, fix in Excel
*1881(111)-10/18: miscalculation in source
*1894(116)-7/5: miscalculation in source

foreach fam in expenses expenses_std expenses_spc {
	egen testsum = rowtotal(`fam'_l `fam'_other)
	generate testdiff = `fam'_total - testsum
	tabulate testdiff
	drop testdiff testsum
}


**Check sums, part 2: expenses_std_<item> + expenses_spc_<item> is compared
**with expenses_<item>, for item = total, then (extra checks) l and other.
**A plain sum is used here, so the result is missing when either part is
**missing.
**
**To inspect the flagged rows, run this after the -tabulate- line of a pass:
***keep if abs(testdiff)>1 & !missing(testsum)
**Note on the extra checks (l and other): these are recurring issues

**Findings: expenses_std_total expenses_spc_total expenses_total
*1894(116)-8/1: miscalculation in source
*1899(118)-8/16: miscalculation in source
*****
*1879(203)-5: miscalculation in source

***************************(Extra checks:)

**Findings: expenses_std_l expenses_spc_l expenses_l
*1874(108)-5/21: miscalculation in source
*1898(117)-8/5: multiple mistakes in source
*1879(203)-5/15: miscalculation in source

**Findings: expenses_std_other expenses_spc_other expenses_other
*1876(109)-5/19: miscalculation in source
*1877(110)-5/15: miscalculation in source
*1881(111)-10/18: miscalculation in source
*1894(116)-7/5: miscalculation in source
*1879(203)-5: miscalculation in source

foreach item in total l other {
	generate testsum = expenses_std_`item' + expenses_spc_`item'
	generate testdiff = expenses_`item' - testsum
	tabulate testdiff
	drop testdiff testsum
}



********************************************************************************
*****Production

**Check sums (Template A): 

******q_leans_gr_t q_leans_glx_t q_leans_mn_t q_leans_tot_t
******q_drys_gr_t q_drys_glx_t q_drys_tot_t
******q_leanl_gr_t q_leanl_gal_t q_leanl_glx_t q_leanl_tot_t
******q_fatl_gr_t q_fatl_glx_q q_fatl_tot_t
******q_fatty_gr_t q_fatty_glx_t q_fatty_tot_t

******q_total_gr_t q_total_gal_t q_total_glx_t q_total_mn_t q_total_t
**Only the last line above is actually tested: the row total of the four
**q_total_<x>_t columns against q_total_t. Rows whose row total is 0 are
**skipped (testdiff stays missing there).
egen testsum = rowtotal(q_total_gr_t q_total_gal_t q_total_glx_t q_total_mn_t)
generate testdiff = q_total_t - testsum if testsum!=0
tabulate testdiff
***keep if abs(testdiff)>1 & !missing(testdiff)
*1866(106)-5/22: probable mistake in source, fixed in Excel
*1866(106)-5/33: probable mistake in source, fixed in Excel
*****
*1845(179)-5/33: probable mistake in source, fixed in Excel
*1845(179)-6/14: miscalculation in source
*1846(180)-5/24: probable mistake in source, fixed in Excel
*1847(181)-3/19: miscalculation in source
*1847(181)-5/33: probable mistake in source, fixed in Excel
*1847(181)-6/23: probable mistake in source, fixed in Excel
*1853(184)-5/6: probable mistake in source, fixed in Excel
*1855(185)-3/33: probable mistake in source, fixed in Excel
*1855(185)-5/15: probable mistake in source, fixed in Excel
*1855(185)-5/40: probable mistake in source, fixed in Excel
*1861(190)-5/39: probable mistake in source, fixed in Excel
*1869(196)-5/28: probable mistake in source, fixed in Excel
drop testdiff testsum



**Check sums (Template B): q_dry_t q_leans_t q_leanl_t q_halfl_t q_fatty_t q_total_t
**Same test with the Template B columns; tabulated for years before 1900 only.
egen testsum = rowtotal(q_dry_t q_leans_t q_leanl_t q_halfl_t q_fatty_t)
generate testdiff = q_total_t - testsum if testsum!=0
tabulate testdiff if year<1900 //OK
***keep if abs(testdiff)>1 & !missing(testdiff) & year<1900
****
*1873(200)-5/13: probable mistake in source, fixed in Excel
*1873(200)-5/19: miscalculation in source
*1880(204)-5/15: probable mistake in source, fixed in Excel
drop testdiff testsum


**Check sums (Template C): 
**Same test with q_fln_t q_fat_t q_halfl_t q_lean_t; tabulated for years
**after 1899 only.
egen testsum = rowtotal(q_fln_t q_fat_t q_halfl_t q_lean_t)
generate testdiff = q_total_t - testsum if testsum!=0
tabulate testdiff if year>1899
***keep if abs(testdiff)>1 & !missing(testdiff) & year>1899
**All fixed
drop testdiff testsum


**Price checks for the grades fat, halfl and lean: q_<grade>_pq divided by
**q_<grade>_t is compared with q_<grade>_p.
**
**To inspect the flagged rows, run this after the -tabulate- line of a pass:
***keep if abs(testdiff)>0.1 & !missing(testdiff)

**Findings for fat:
*1913(124)-8/Bonnefin Baneux: mistake in source
**Findings for halfl:
**All fixed
**Findings for lean:
**All fixed

foreach grade in fat halfl lean {
	generate testp = q_`grade'_pq/q_`grade'_t
	generate testdiff = q_`grade'_p-testp
	tabulate testdiff
	drop testdiff testp
}


********************************************************************************
*****Revenue

**Check product for large outliers: q_total_t q_total_p q_total_pq

***Template A+B:
**testp is q_total_pq divided by q_total_t.
generate testp = q_total_pq/q_total_t
***scatter testp year, name(t1)

**Summary statistics of all variables, restricted to rows with testp above 26.
summarize if !missing(testp) & testp>26 //OK
*1880(204)-6/12: miscalculation in source
***keep if testp<4 //OK


***Template B+C:
**Reported q_total_p minus testp as computed above.
generate testdiff =  q_total_p - testp if !missing(testp)
tabulate testdiff
***keep if abs(testdiff)>0.1 & !missing(testdiff)
*1874(108)-5/18: miscalculation in source
*1881(111)-10/10: miscalculation in source
*1881(111)-10/18: miscalculation in source
*1883(113)-7/6: miscalculation in source
*1885(114)-5/10: miscalculation in source
*1893(116)-6/14: miscalculation in source
*1897(117)-8/1: miscalculation in source
*1899(118)-7/4: miscalculation in source
*****
*1873(200)-6/8: miscalculation in source
*1875(201)-5: miscalculation in source
*1878(202)-5/27: miscalculation in source
*1879(203)-5/20: probable mistake in source, fixed in Excel
*1880(204)-5/15: miscalculation in source
*1880(204)-6/12: miscalculation in source (cfr. supra)
drop testp testdiff


**Reverse direction: reported q_total_pq minus the product q_total_t x q_total_p.
generate testpq = q_total_t*q_total_p
generate testdiff =  q_total_pq - testpq if !missing(testpq)
tabulate testdiff
***keep if abs(testdiff)>5000& !missing(testdiff)
//Impression: mostly the result of approximations
*1874(108)-5/18: checked
*1874(108)-5/30: checked
*1882(112)-7/4: checked
*1883(113)-7/6: checked
*1885(114): checked
*1891(115)-5/25: checked
*1897(117)-8/1: checked
*1898(117)-6/2: checked
*1899(118)-7/4: checked
*****
*1873(200)-6/8: checked
*1875(201)-5: checked
*1880(204)-5/15: checked
*1880(204)-6/12: miscalculation in source (cfr. supra)
drop testpq testdiff


********************************************************************************balance_gain balance_loss

**The absolute gap between q_total_pq and expenses_total is compared with
**balance_gain + balance_loss (row total, so a missing entry counts as 0).
**The check only tabulates the differences; no data is modified.
generate testbalance = abs(q_total_pq-expenses_total)
egen testbalance2 = rowtotal(balance_gain balance_loss)
generate testdiff =  testbalance2 - testbalance if !missing(testbalance)
tabulate testdiff
***keep if abs(testdiff)>=1& !missing(testdiff) & year<1900
*1849(101)-4/1: checked
*1854(104)-5/22: checked
*1854(104)-6/21: checked
*1859(105)-6/36: checked
*1866(106)-4/36: checked
*1874(108)-5/18: checked
*1876(109)-5/6: checked
*1877(110)-7/5: checked
*1885(114B)-6: checked
*****
*1845(179)-5/24: checked
*1846(180)-5/22: checked
*1846(180)-7: checked
*1847(181)-5+6: checked
*1847(181)-7/3: checked
*1848(182)-6/42+43: checked
*1855(185)-3/38: checked
*1855(185)-5/28: checked
*1857(187)-3: checked
*1858(188)-5/38: checked
*1860(189)-6/5: checked
*1867(195)-5: checked
*1871(198)-7/9: checked
*1872(199)-5/12: checked
*1880(204)-6/6: checked
drop testbalance testbalance2 testdiff


//Save data 
////////////////////////////////////////////////////////////////////////////////


**Put the identifying variables first, sort, and write the result to disk.
order mine_id id year
**-stable- keeps observations that tie on (id, year) in their current relative
**order. A plain sort randomises the order of ties, so the saved file could
**differ from run to run if (id, year) is not unique.
sort id year, stable
**NOTE(review): this overwrites the same file the do-file loaded at the top;
**keep a backup of the raw data if the uncorrected values are still needed.
save data_adm.dta, replace

exit
////////////////////////////////////////////////////////////////////////////////
////////////////////////////////////////////////////////////////////////////////
////////////////////////////////////////////////////////////////////////////////
